import pandas as pd
import matplotlib.pylab as plt
import numpy as np
import seaborn as sns
%matplotlib inline
# Path of the asteroid dataset, relative to the notebook's working directory.
filename = r"Asteroid_Updated.csv"
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning and the
# inconsistent per-chunk value types that come with it.
df = pd.read_csv(filename, low_memory=False)
C:\Users\DELL\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3146: DtypeWarning: Columns (0,10,15,16,23,24) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
# Preview the first 20 rows of the raw table.
df.head(n=20)
name | a | e | i | om | w | q | ad | per_y | data_arc | ... | UB | IR | spec_B | spec_T | G | moid | class | n | per | ma | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Ceres | 2.769165 | 0.076009 | 10.594067 | 80.305532 | 73.597694 | 2.558684 | 2.979647 | 4.608202 | 8822.0 | ... | 0.426 | NaN | C | G | 0.12 | 1.594780 | MBA | 0.213885 | 1683.145708 | 77.372096 |
1 | Pallas | 2.772466 | 0.230337 | 34.836234 | 173.080063 | 310.048857 | 2.133865 | 3.411067 | 4.616444 | 72318.0 | ... | 0.284 | NaN | B | B | 0.11 | 1.233240 | MBA | 0.213503 | 1686.155999 | 59.699133 |
2 | Juno | 2.669150 | 0.256942 | 12.988919 | 169.852760 | 248.138626 | 1.983332 | 3.354967 | 4.360814 | 72684.0 | ... | 0.433 | NaN | Sk | S | 0.32 | 1.034540 | MBA | 0.226019 | 1592.787285 | 34.925016 |
3 | Vesta | 2.361418 | 0.088721 | 7.141771 | 103.810804 | 150.728541 | 2.151909 | 2.570926 | 3.628837 | 24288.0 | ... | 0.492 | NaN | V | V | 0.32 | 1.139480 | MBA | 0.271609 | 1325.432765 | 95.861936 |
4 | Astraea | 2.574249 | 0.191095 | 5.366988 | 141.576605 | 358.687607 | 2.082324 | 3.066174 | 4.130323 | 63507.0 | ... | 0.411 | NaN | S | S | NaN | 1.095890 | MBA | 0.238632 | 1508.600458 | 282.366289 |
5 | Hebe | 2.425160 | 0.203007 | 14.737901 | 138.640203 | 239.807490 | 1.932835 | 2.917485 | 3.776755 | 62329.0 | ... | 0.399 | NaN | S | S | 0.24 | 0.973965 | MBA | 0.260972 | 1379.459705 | 86.197923 |
6 | Iris | 2.385334 | 0.231206 | 5.523651 | 259.563231 | 145.265106 | 1.833831 | 2.936837 | 3.684105 | 62452.0 | ... | 0.484 | NaN | S | S | NaN | 0.846100 | MBA | 0.267535 | 1345.619196 | 140.419656 |
7 | Flora | 2.201764 | 0.156499 | 5.886955 | 110.889330 | 285.287462 | 1.857190 | 2.546339 | 3.267115 | 62655.0 | ... | 0.489 | NaN | NaN | S | 0.28 | 0.874176 | MBA | 0.301681 | 1193.313717 | 194.882895 |
8 | Metis | 2.385637 | 0.123114 | 5.576816 | 68.908577 | 6.417369 | 2.091931 | 2.679342 | 3.684806 | 61821.0 | ... | 0.496 | NaN | NaN | S | 0.17 | 1.106910 | MBA | 0.267484 | 1345.875362 | 276.861623 |
9 | Hygiea | 3.141539 | 0.112461 | 3.831560 | 283.202167 | 312.315206 | 2.788240 | 3.494839 | 5.568291 | 62175.0 | ... | 0.351 | NaN | C | C | NaN | 1.778390 | MBA | 0.177007 | 2033.818284 | 152.184851 |
10 | Parthenope | 2.453109 | 0.100472 | 4.629886 | 125.546585 | 195.550396 | 2.206640 | 2.699579 | 3.842232 | 61755.0 | ... | 0.417 | NaN | Sk | S | NaN | 1.193220 | MBA | 0.256524 | 1403.375193 | 278.930692 |
11 | Victoria | 2.334315 | 0.220172 | 8.373074 | 235.410169 | 69.641819 | 1.820365 | 2.848265 | 3.566543 | 61769.0 | ... | 0.515 | NaN | L | S | 0.22 | 0.824953 | MBA | 0.276353 | 1302.679690 | 133.335892 |
12 | Egeria | 2.575981 | 0.085121 | 16.536125 | 43.221913 | 80.544823 | 2.356710 | 2.795252 | 4.134492 | 61680.0 | ... | 0.452 | NaN | Ch | G | NaN | 1.436330 | MBA | 0.238391 | 1510.123380 | 187.488522 |
13 | Irene | 2.585567 | 0.166582 | 9.121646 | 86.122665 | 97.858985 | 2.154858 | 3.016277 | 4.157593 | 61526.0 | ... | 0.388 | NaN | S | S | NaN | 1.179660 | MBA | 0.237067 | 1518.560847 | 164.935853 |
14 | Eunomia | 2.644100 | 0.186084 | 11.752430 | 292.934339 | 98.498681 | 2.152075 | 3.136126 | 4.299571 | 61247.0 | ... | 0.451 | NaN | S | S | 0.23 | 1.194850 | MBA | 0.229238 | 1570.418187 | 283.387698 |
15 | Psyche | 2.923814 | 0.133568 | 3.096005 | 150.045666 | 228.823071 | 2.533285 | 3.314343 | 4.999571 | 12856.0 | ... | 0.299 | NaN | X | M | 0.20 | 1.535800 | MBA | 0.197142 | 1826.093319 | 288.335893 |
16 | Thetis | 2.470354 | 0.133032 | 5.591205 | 125.552945 | 136.208250 | 2.141719 | 2.798989 | 3.882818 | 61117.0 | ... | 0.438 | NaN | Sl | S | NaN | 1.129810 | MBA | 0.253843 | 1418.199204 | 303.364363 |
17 | Melpomene | 2.296654 | 0.217674 | 10.128731 | 150.383862 | 227.950847 | 1.796731 | 2.796576 | 3.480578 | 60906.0 | ... | 0.425 | NaN | S | S | 0.25 | 0.813258 | MBA | 0.283179 | 1271.281262 | 267.254381 |
18 | Fortuna | 2.442711 | 0.158047 | 1.573782 | 211.144044 | 182.065018 | 2.056648 | 2.828773 | 3.817827 | 60970.0 | ... | 0.324 | NaN | Ch | G | 0.10 | 1.062130 | MBA | 0.258164 | 1394.461340 | 197.338626 |
19 | Massalia | 2.409782 | 0.142067 | 0.708751 | 206.108911 | 256.773196 | 2.067432 | 2.752132 | 3.740889 | 59461.0 | ... | 0.463 | NaN | S | S | 0.25 | 1.084610 | MBA | 0.263474 | 1366.359575 | 117.695129 |
20 rows × 31 columns
# Cells holding exactly one blank character are placeholders for missing
# values; turn them into NaN so pandas' missing-data tools recognise them.
df.replace(to_replace=" ", value=np.nan, inplace=True)
# Show the last five rows.
df.tail()
name | a | e | i | om | w | q | ad | per_y | data_arc | ... | UB | IR | spec_B | spec_T | G | moid | class | n | per | ma | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
839709 | NaN | 2.812945 | 0.664688 | 4.695700 | 183.310012 | 234.618352 | 0.943214 | 4.682676 | 4.717914 | 17298.0 | ... | NaN | NaN | NaN | NaN | NaN | 0.032397 | APO | 0.208911 | 1723.217927 | 156.905910 |
839710 | NaN | 2.645238 | 0.259376 | 12.574937 | 1.620020 | 339.568072 | 1.959126 | 3.331350 | 4.302346 | 16.0 | ... | NaN | NaN | NaN | NaN | NaN | 0.956145 | MBA | 0.229090 | 1571.431965 | 13.366251 |
839711 | NaN | 2.373137 | 0.202053 | 0.732484 | 176.499082 | 198.026527 | 1.893638 | 2.852636 | 3.655884 | 5.0 | ... | NaN | NaN | NaN | NaN | NaN | 0.893896 | MBA | 0.269600 | 1335.311579 | 355.351127 |
839712 | NaN | 2.260404 | 0.258348 | 9.661947 | 204.512448 | 148.496988 | 1.676433 | 2.844376 | 3.398501 | 10.0 | ... | NaN | NaN | NaN | NaN | NaN | 0.680220 | MBA | 0.290018 | 1241.302609 | 15.320134 |
839713 | NaN | 2.546442 | 0.287672 | 5.356238 | 70.709555 | 273.483265 | 1.813901 | 3.278983 | 4.063580 | 11.0 | ... | NaN | NaN | NaN | NaN | NaN | 0.815280 | MBA | 0.242551 | 1484.222588 | 20.432959 |
5 rows × 31 columns
# Boolean frame with the same shape as df: True wherever a cell is missing.
missing_data = df.isna()
missing_data.head()
name | a | e | i | om | w | q | ad | per_y | data_arc | ... | UB | IR | spec_B | spec_T | G | moid | class | n | per | ma | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | False | False | False | False | False | False | False | False | False | False | ... | False | True | False | False | False | False | False | False | False | False |
1 | False | False | False | False | False | False | False | False | False | False | ... | False | True | False | False | False | False | False | False | False | False |
2 | False | False | False | False | False | False | False | False | False | False | ... | False | True | False | False | False | False | False | False | False | False |
3 | False | False | False | False | False | False | False | False | False | False | ... | False | True | False | False | False | False | False | False | False | False |
4 | False | False | False | False | False | False | False | False | False | False | ... | False | True | False | False | True | False | False | False | False | False |
5 rows × 31 columns
# For every column, print how many cells are missing (True) and how many are
# present (False), followed by an empty line as a separator.
for column in missing_data.columns:
    print(column)
    print(missing_data[column].value_counts())
    print("")
name True 817747 False 21967 Name: name, dtype: int64 a False 839712 True 2 Name: a, dtype: int64 e False 839714 Name: e, dtype: int64 i False 839714 Name: i, dtype: int64 om False 839714 Name: om, dtype: int64 w False 839714 Name: w, dtype: int64 q False 839714 Name: q, dtype: int64 ad False 839708 True 6 Name: ad, dtype: int64 per_y False 839713 True 1 Name: per_y, dtype: int64 data_arc False 824240 True 15474 Name: data_arc, dtype: int64 condition_code False 838847 True 867 Name: condition_code, dtype: int64 n_obs_used False 839714 Name: n_obs_used, dtype: int64 H False 837025 True 2689 Name: H, dtype: int64 neo False 839708 True 6 Name: neo, dtype: int64 pha False 823272 True 16442 Name: pha, dtype: int64 diameter True 702078 False 137636 Name: diameter, dtype: int64 extent True 839696 False 18 Name: extent, dtype: int64 albedo True 703305 False 136409 Name: albedo, dtype: int64 rot_per True 820918 False 18796 Name: rot_per, dtype: int64 GM True 839700 False 14 Name: GM, dtype: int64 BV True 838693 False 1021 Name: BV, dtype: int64 UB True 838735 False 979 Name: UB, dtype: int64 IR True 839713 False 1 Name: IR, dtype: int64 spec_B True 838048 False 1666 Name: spec_B, dtype: int64 spec_T True 838734 False 980 Name: spec_T, dtype: int64 G True 839595 False 119 Name: G, dtype: int64 moid False 823272 True 16442 Name: moid, dtype: int64 class False 839714 Name: class, dtype: int64 n False 839712 True 2 Name: n, dtype: int64 per False 839708 True 6 Name: per, dtype: int64 ma False 839706 True 8 Name: ma, dtype: int64
# Keep only the rows that have a diameter value, then renumber the index from
# zero because dropping rows leaves gaps in the original index.
df = df.dropna(subset=["diameter"]).reset_index(drop=True)
df.head()
name | a | e | i | om | w | q | ad | per_y | data_arc | ... | UB | IR | spec_B | spec_T | G | moid | class | n | per | ma | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Ceres | 2.769165 | 0.076009 | 10.594067 | 80.305532 | 73.597694 | 2.558684 | 2.979647 | 4.608202 | 8822.0 | ... | 0.426 | NaN | C | G | 0.12 | 1.59478 | MBA | 0.213885 | 1683.145708 | 77.372096 |
1 | Pallas | 2.772466 | 0.230337 | 34.836234 | 173.080063 | 310.048857 | 2.133865 | 3.411067 | 4.616444 | 72318.0 | ... | 0.284 | NaN | B | B | 0.11 | 1.23324 | MBA | 0.213503 | 1686.155999 | 59.699133 |
2 | Juno | 2.669150 | 0.256942 | 12.988919 | 169.852760 | 248.138626 | 1.983332 | 3.354967 | 4.360814 | 72684.0 | ... | 0.433 | NaN | Sk | S | 0.32 | 1.03454 | MBA | 0.226019 | 1592.787285 | 34.925016 |
3 | Vesta | 2.361418 | 0.088721 | 7.141771 | 103.810804 | 150.728541 | 2.151909 | 2.570926 | 3.628837 | 24288.0 | ... | 0.492 | NaN | V | V | 0.32 | 1.13948 | MBA | 0.271609 | 1325.432765 | 95.861936 |
4 | Astraea | 2.574249 | 0.191095 | 5.366988 | 141.576605 | 358.687607 | 2.082324 | 3.066174 | 4.130323 | 63507.0 | ... | 0.411 | NaN | S | S | NaN | 1.09589 | MBA | 0.238632 | 1508.600458 | 282.366289 |
5 rows × 31 columns
# Inspect the dtype pandas assigned to each column, to spot numeric columns
# that were read in as object.
df.dtypes
name object a float64 e float64 i float64 om float64 w float64 q float64 ad float64 per_y float64 data_arc float64 condition_code object n_obs_used int64 H float64 neo object pha object diameter object extent object albedo float64 rot_per float64 GM float64 BV float64 UB float64 IR float64 spec_B object spec_T object G float64 moid float64 class object n float64 per float64 ma float64 dtype: object
# diameter was read as strings (object dtype); cast it to float so it can be
# used in numeric computations.
df["diameter"] = df["diameter"].astype(float)
df["diameter"].dtype
dtype('float64')
# Remove the columns that carry insufficient or irrelevant data.
df.drop(
    columns=[
        "name",
        "extent",
        "rot_per",
        "GM",
        "BV",
        "UB",
        "IR",
        "spec_B",
        "spec_T",
        "G",
    ],
    inplace=True,
)
df.head()
a | e | i | om | w | q | ad | per_y | data_arc | condition_code | ... | H | neo | pha | diameter | albedo | moid | class | n | per | ma | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.769165 | 0.076009 | 10.594067 | 80.305532 | 73.597694 | 2.558684 | 2.979647 | 4.608202 | 8822.0 | 0 | ... | 3.34 | N | N | 939.400 | 0.0900 | 1.59478 | MBA | 0.213885 | 1683.145708 | 77.372096 |
1 | 2.772466 | 0.230337 | 34.836234 | 173.080063 | 310.048857 | 2.133865 | 3.411067 | 4.616444 | 72318.0 | 0 | ... | 4.13 | N | N | 545.000 | 0.1010 | 1.23324 | MBA | 0.213503 | 1686.155999 | 59.699133 |
2 | 2.669150 | 0.256942 | 12.988919 | 169.852760 | 248.138626 | 1.983332 | 3.354967 | 4.360814 | 72684.0 | 0 | ... | 5.33 | N | N | 246.596 | 0.2140 | 1.03454 | MBA | 0.226019 | 1592.787285 | 34.925016 |
3 | 2.361418 | 0.088721 | 7.141771 | 103.810804 | 150.728541 | 2.151909 | 2.570926 | 3.628837 | 24288.0 | 0 | ... | 3.20 | N | N | 525.400 | 0.4228 | 1.13948 | MBA | 0.271609 | 1325.432765 | 95.861936 |
4 | 2.574249 | 0.191095 | 5.366988 | 141.576605 | 358.687607 | 2.082324 | 3.066174 | 4.130323 | 63507.0 | 0 | ... | 6.85 | N | N | 106.699 | 0.2740 | 1.09589 | MBA | 0.238632 | 1508.600458 | 282.366289 |
5 rows × 21 columns
# Re-check the column dtypes after the conversion and column removal above.
df.dtypes
a float64 e float64 i float64 om float64 w float64 q float64 ad float64 per_y float64 data_arc float64 condition_code object n_obs_used int64 H float64 neo object pha object diameter float64 albedo float64 moid float64 class object n float64 per float64 ma float64 dtype: object
# Give the abbreviated columns more descriptive names.
# An explicit old -> new mapping is used instead of assigning a positional
# list to df.columns, so a change in column order can never attach a label to
# the wrong data. Columns not listed here keep their current name, and
# re-running the cell is harmless.
df.rename(
    columns={
        "a": "semi_major_axis",
        "e": "eccentricity",
        "i": "Inclination",
        "om": "Longitude",
        "w": "perihelion_arg",
        "q": "perihelion_dis",
        "ad": "aphelion_dist",
        "per_y": "Orbital_period_per_y",
        "H": "Abs_Magnitude",
        "n": "Mean_motion",
        "per": "orbital_Period_per",
        "ma": "Mean_anomaly",
    },
    inplace=True,
)
df.head()
semi_major_axis | eccentricity | Inclination | Longitude | perihelion_arg | perihelion_dis | aphelion_dist | Orbital_period_per_y | data_arc | condition_code | ... | Abs_Magnitude | neo | pha | diameter | albedo | moid | class | Mean_motion | orbital_Period_per | Mean_anomaly | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.769165 | 0.076009 | 10.594067 | 80.305532 | 73.597694 | 2.558684 | 2.979647 | 4.608202 | 8822.0 | 0 | ... | 3.34 | N | N | 939.400 | 0.0900 | 1.59478 | MBA | 0.213885 | 1683.145708 | 77.372096 |
1 | 2.772466 | 0.230337 | 34.836234 | 173.080063 | 310.048857 | 2.133865 | 3.411067 | 4.616444 | 72318.0 | 0 | ... | 4.13 | N | N | 545.000 | 0.1010 | 1.23324 | MBA | 0.213503 | 1686.155999 | 59.699133 |
2 | 2.669150 | 0.256942 | 12.988919 | 169.852760 | 248.138626 | 1.983332 | 3.354967 | 4.360814 | 72684.0 | 0 | ... | 5.33 | N | N | 246.596 | 0.2140 | 1.03454 | MBA | 0.226019 | 1592.787285 | 34.925016 |
3 | 2.361418 | 0.088721 | 7.141771 | 103.810804 | 150.728541 | 2.151909 | 2.570926 | 3.628837 | 24288.0 | 0 | ... | 3.20 | N | N | 525.400 | 0.4228 | 1.13948 | MBA | 0.271609 | 1325.432765 | 95.861936 |
4 | 2.574249 | 0.191095 | 5.366988 | 141.576605 | 358.687607 | 2.082324 | 3.066174 | 4.130323 | 63507.0 | 0 | ... | 6.85 | N | N | 106.699 | 0.2740 | 1.09589 | MBA | 0.238632 | 1508.600458 | 282.366289 |
5 rows × 21 columns
# Mean of the Abs_Magnitude column; missing entries are skipped by mean().
avg_Abs_Magnitude = df["Abs_Magnitude"].astype(float).mean()
print("Abs_Magnitude Mean:", avg_Abs_Magnitude)
Abs_Magnitude Mean: 15.177041267011587
# Impute missing Abs_Magnitude values with the column mean.
# The result is assigned back to the column: an in-place call on the
# df['Abs_Magnitude'] selection acts on a temporary object and does not update
# df when pandas Copy-on-Write is active.
df['Abs_Magnitude'] = df['Abs_Magnitude'].fillna(avg_Abs_Magnitude)
# Mean of the data_arc column, truncated to a whole number.
avg_data_arc = int(df["data_arc"].astype(float).mean())
print("avg_data_arc Mean:", avg_data_arc)
avg_data_arc Mean: 8969
# Impute missing data_arc values with the truncated column mean.
# The result is assigned back to the column: an in-place call on the
# df['data_arc'] selection acts on a temporary object and does not update df
# when pandas Copy-on-Write is active.
df['data_arc'] = df['data_arc'].fillna(avg_data_arc)
# Mean of the albedo column; missing entries are skipped by mean().
avg_albedo = df["albedo"].astype(float).mean()
print("avg_albedo Mean:", avg_albedo)
avg_albedo Mean: 0.13006564520622568
# Impute missing albedo values with the column mean.
# The result is assigned back to the column: an in-place call on the
# df['albedo'] selection acts on a temporary object and does not update df
# when pandas Copy-on-Write is active.
df['albedo'] = df['albedo'].fillna(avg_albedo)
# Rebuild the missing-value mask after the cleaning steps and print, for every
# column, how many cells are missing (True) and how many are present (False).
# The former mid-cell missing_data.head(5) was removed: its result was
# discarded and never displayed.
missing_data = df.isnull()
for column in missing_data.columns:
    print(column)
    print(missing_data[column].value_counts())
    print("")
semi_major_axis False 137636 Name: semi_major_axis, dtype: int64 eccentricity False 137636 Name: eccentricity, dtype: int64 Inclination False 137636 Name: Inclination, dtype: int64 Longitude False 137636 Name: Longitude, dtype: int64 perihelion_arg False 137636 Name: perihelion_arg, dtype: int64 perihelion_dis False 137636 Name: perihelion_dis, dtype: int64 aphelion_dist False 137636 Name: aphelion_dist, dtype: int64 Orbital_period_per_y False 137636 Name: Orbital_period_per_y, dtype: int64 data_arc False 137636 Name: data_arc, dtype: int64 condition_code False 137636 Name: condition_code, dtype: int64 n_obs_used False 137636 Name: n_obs_used, dtype: int64 Abs_Magnitude False 137636 Name: Abs_Magnitude, dtype: int64 neo False 137636 Name: neo, dtype: int64 pha False 137636 Name: pha, dtype: int64 diameter False 137636 Name: diameter, dtype: int64 albedo False 137636 Name: albedo, dtype: int64 moid False 137636 Name: moid, dtype: int64 class False 137636 Name: class, dtype: int64 Mean_motion False 137636 Name: Mean_motion, dtype: int64 orbital_Period_per False 137636 Name: orbital_Period_per, dtype: int64 Mean_anomaly False 137636 Name: Mean_anomaly, dtype: int64
#This will convert the dataframe into a csv file named 'astroid_cleansed'
#df.to_csv('astroid_cleansed.csv')
# Pairwise correlation matrix of the numeric columns.
# The frame still holds object columns (condition_code, neo, pha, class);
# selecting numeric columns explicitly keeps corr() from failing on them in
# pandas versions that no longer drop non-numeric columns silently.
df.select_dtypes(include="number").corr()
semi_major_axis | eccentricity | Inclination | Longitude | perihelion_arg | perihelion_dis | aphelion_dist | Orbital_period_per_y | data_arc | n_obs_used | Abs_Magnitude | diameter | albedo | moid | Mean_motion | orbital_Period_per | Mean_anomaly | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
semi_major_axis | 1.000000 | 0.021343 | 0.148705 | -0.000675 | -0.002379 | 0.366541 | 0.986158 | 0.941359 | -0.019646 | -0.049336 | -0.132610 | 0.144736 | -0.110168 | 0.369826 | -0.279518 | 0.941359 | 0.014283 |
eccentricity | 0.021343 | 1.000000 | 0.144882 | -0.000398 | 0.012354 | -0.514929 | 0.114209 | 0.048275 | -0.028362 | -0.076498 | 0.198676 | -0.049133 | -0.019264 | -0.490871 | 0.189318 | 0.048275 | -0.018082 |
Inclination | 0.148705 | 0.144882 | 1.000000 | -0.012863 | -0.004291 | 0.085951 | 0.141043 | 0.096304 | -0.199472 | -0.224616 | -0.033771 | 0.052609 | -0.089066 | 0.125247 | -0.108291 | 0.096304 | 0.015262 |
Longitude | -0.000675 | -0.000398 | -0.012863 | 1.000000 | -0.106991 | -0.002965 | -0.000181 | 0.000367 | -0.000501 | -0.023611 | 0.002801 | 0.001164 | 0.000739 | -0.003694 | 0.008053 | 0.000367 | -0.003253 |
perihelion_arg | -0.002379 | 0.012354 | -0.004291 | -0.106991 | 1.000000 | -0.006678 | -0.001312 | -0.001621 | -0.005570 | 0.010236 | -0.008439 | 0.002966 | -0.003027 | -0.006532 | 0.002438 | -0.001621 | 0.001528 |
perihelion_dis | 0.366541 | -0.514929 | 0.085951 | -0.002965 | -0.006678 | 1.000000 | 0.207199 | 0.109322 | -0.017597 | -0.080338 | -0.374466 | 0.329703 | -0.261687 | 0.996821 | -0.706225 | 0.109322 | 0.071083 |
aphelion_dist | 0.986158 | 0.114209 | 0.141043 | -0.000181 | -0.001312 | 0.207199 | 1.000000 | 0.970338 | -0.017522 | -0.037559 | -0.072703 | 0.093430 | -0.069204 | 0.211219 | -0.168049 | 0.970338 | 0.002351 |
Orbital_period_per_y | 0.941359 | 0.048275 | 0.096304 | 0.000367 | -0.001621 | 0.109322 | 0.970338 | 1.000000 | -0.007837 | -0.011021 | -0.035556 | 0.048953 | -0.019492 | 0.110897 | -0.059904 | 1.000000 | -0.005284 |
data_arc | -0.019646 | -0.028362 | -0.199472 | -0.000501 | -0.005570 | -0.017597 | -0.017522 | -0.007837 | 1.000000 | 0.755441 | -0.670998 | 0.491580 | 0.254523 | -0.025349 | 0.039911 | -0.007837 | -0.017377 |
n_obs_used | -0.049336 | -0.076498 | -0.224616 | -0.023611 | 0.010236 | -0.080338 | -0.037559 | -0.011021 | 0.755441 | 1.000000 | -0.782006 | 0.385747 | 0.448285 | -0.090066 | 0.127482 | -0.011021 | -0.047139 |
Abs_Magnitude | -0.132610 | 0.198676 | -0.033771 | 0.002801 | -0.008439 | -0.374466 | -0.072703 | -0.035556 | -0.670998 | -0.782006 | 1.000000 | -0.568493 | -0.240790 | -0.370004 | 0.328595 | -0.035556 | -0.006041 |
diameter | 0.144736 | -0.049133 | 0.052609 | 0.001164 | 0.002966 | 0.329703 | 0.093430 | 0.048953 | 0.491580 | 0.385747 | -0.568493 | 1.000000 | -0.107334 | 0.332423 | -0.201023 | 0.048953 | 0.009659 |
albedo | -0.110168 | -0.019264 | -0.089066 | 0.000739 | -0.003027 | -0.261687 | -0.069204 | -0.019492 | 0.254523 | 0.448285 | -0.240790 | -0.107334 | 1.000000 | -0.266281 | 0.343051 | -0.019492 | -0.044653 |
moid | 0.369826 | -0.490871 | 0.125247 | -0.003694 | -0.006532 | 0.996821 | 0.211219 | 0.110897 | -0.025349 | -0.090066 | -0.370004 | 0.332423 | -0.266281 | 1.000000 | -0.692139 | 0.110897 | 0.072123 |
Mean_motion | -0.279518 | 0.189318 | -0.108291 | 0.008053 | 0.002438 | -0.706225 | -0.168049 | -0.059904 | 0.039911 | 0.127482 | 0.328595 | -0.201023 | 0.343051 | -0.692139 | 1.000000 | -0.059904 | -0.063535 |
orbital_Period_per | 0.941359 | 0.048275 | 0.096304 | 0.000367 | -0.001621 | 0.109322 | 0.970338 | 1.000000 | -0.007837 | -0.011021 | -0.035556 | 0.048953 | -0.019492 | 0.110897 | -0.059904 | 1.000000 | -0.005284 |
Mean_anomaly | 0.014283 | -0.018082 | 0.015262 | -0.003253 | 0.001528 | 0.071083 | 0.002351 | -0.005284 | -0.017377 | -0.047139 | -0.006041 | 0.009659 | -0.044653 | 0.072123 | -0.063535 | -0.005284 | 1.000000 |
# Correlation of diameter with every numeric feature.
# All numeric columns are included; the previous df.columns[1:] slice silently
# left the first column (semi_major_axis) out of the comparison. Object
# columns are excluded up front because corr() cannot process them.
df.select_dtypes(include="number").corr()["diameter"]
eccentricity -0.049133 Inclination 0.052609 Longitude 0.001164 perihelion_arg 0.002966 perihelion_dis 0.329703 aphelion_dist 0.093430 Orbital_period_per_y 0.048953 data_arc 0.491580 n_obs_used 0.385747 Abs_Magnitude -0.568493 diameter 1.000000 albedo -0.107334 moid 0.332423 Mean_motion -0.201023 orbital_Period_per 0.048953 Mean_anomaly 0.009659 Name: diameter, dtype: float64
# 2x2 correlation matrix between the observation count and the diameter.
df.loc[:, ["n_obs_used", "diameter"]].corr()
n_obs_used | diameter | |
---|---|---|
n_obs_used | 1.000000 | 0.385747 |
diameter | 0.385747 | 1.000000 |
# Scatter of eccentricity against diameter with a fitted regression line.
f, ax = plt.subplots(figsize=(12, 5))
c1 = sns.regplot(data=df, x="diameter", y="eccentricity", ax=ax)
ax.set_ylim(bottom=0)  # anchor the y-axis at zero, upper limit stays automatic
c1.set_title("Eccentricity Correlation with Diameter", fontdict={"fontsize": 18}, pad=16)
Text(0.5, 1.0, 'Eccentricity Correlation with Diameter')
# Scatter of Inclination against diameter with a fitted regression line.
f, ax = plt.subplots(figsize=(12, 5))
c2 = sns.regplot(data=df, x="diameter", y="Inclination", ax=ax)
ax.set_ylim(bottom=0)  # anchor the y-axis at zero, upper limit stays automatic
c2.set_title("Inclination Correlation with Diameter", fontdict={"fontsize": 18}, pad=16)
Text(0.5, 1.0, 'Inclination Correlation with Diameter')
# Scatter of Longitude against diameter with a fitted regression line.
f, ax = plt.subplots(figsize=(12, 5))
c3 = sns.regplot(data=df, x="diameter", y="Longitude", ax=ax)
ax.set_ylim(bottom=0)  # anchor the y-axis at zero, upper limit stays automatic
c3.set_title("Longitude Correlation with Diameter", fontdict={"fontsize": 18}, pad=16)
Text(0.5, 1.0, 'Longitude Correlation with Diameter')
# Scatter of perihelion_arg against diameter with a fitted regression line.
f, ax = plt.subplots(figsize=(12, 5))
c4 = sns.regplot(data=df, x="diameter", y="perihelion_arg", ax=ax)
ax.set_ylim(bottom=0)  # anchor the y-axis at zero, upper limit stays automatic
c4.set_title("Perihelion_arg Correlation with Diameter", fontdict={"fontsize": 18}, pad=16)
Text(0.5, 1.0, 'Perihelion_arg Correlation with Diameter')
# Scatter of perihelion_dis against diameter with a fitted regression line.
f, ax = plt.subplots(figsize=(12, 5))
c5 = sns.regplot(x="diameter", y="perihelion_dis", data=df, ax=ax)
plt.ylim(0,)  # anchor the y-axis at zero, upper limit stays automatic
# Title spelling corrected (was 'Periherlion_dis').
c5.set_title('Perihelion_dis Correlation with Diameter', fontdict={'fontsize':18}, pad=16)
Text(0.5, 1.0, 'Periherlion_dis Correlation with Diameter')
# Scatter of aphelion_dist against diameter with a fitted regression line.
f, ax = plt.subplots(figsize=(12, 5))
c6 = sns.regplot(data=df, x="diameter", y="aphelion_dist", ax=ax)
ax.set_ylim(bottom=0)  # anchor the y-axis at zero, upper limit stays automatic
c6.set_title("Aphelion_dist Correlation with Diameter", fontdict={"fontsize": 18}, pad=16)
Text(0.5, 1.0, 'Aphelion_dist Correlation with Diameter')
# Scatter of Orbital_period_per_y against diameter with a fitted regression line.
f, ax = plt.subplots(figsize=(12, 5))
c7 = sns.regplot(data=df, x="diameter", y="Orbital_period_per_y", ax=ax)
ax.set_ylim(bottom=0)  # anchor the y-axis at zero, upper limit stays automatic
c7.set_title("Orbital_Period_per_y Correlation with Diameter", fontdict={"fontsize": 18}, pad=16)
Text(0.5, 1.0, 'Orbital_Period_per_y Correlation with Diameter')
# Scatter of data_arc against diameter with a fitted regression line.
f, ax = plt.subplots(figsize=(12, 5))
c8 = sns.regplot(data=df, x="diameter", y="data_arc", ax=ax)
ax.set_ylim(bottom=0)  # anchor the y-axis at zero, upper limit stays automatic
c8.set_title("Data_arc Correlation with Diameter", fontdict={"fontsize": 18}, pad=16)
Text(0.5, 1.0, 'Data_arc Correlation with Diameter')
# Scatter of n_obs_used against diameter with a fitted regression line.
f, ax = plt.subplots(figsize=(12, 5))
c9 = sns.regplot(data=df, x="diameter", y="n_obs_used", ax=ax)
ax.set_ylim(bottom=0)  # anchor the y-axis at zero, upper limit stays automatic
c9.set_title("n_obs_used Correlation with Diameter", fontdict={"fontsize": 18}, pad=16)
Text(0.5, 1.0, 'n_obs_used Correlation with Diameter')
# Scatter of Abs_Magnitude against diameter with a fitted regression line.
f, ax = plt.subplots(figsize=(12, 5))
# Keep the Axes returned by regplot: the title call below needs it, and
# leaving it unassigned is what raised "NameError: name 'c10' is not defined".
c10 = sns.regplot(x="diameter", y="Abs_Magnitude", data=df, ax=ax)
plt.ylim(0,)  # anchor the y-axis at zero, upper limit stays automatic
c10.set_title('Abs_Magnitude Correlation with Diameter', fontdict={'fontsize':18}, pad=16)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-38-e9040efe2f6e> in <module> 2 sns.regplot(x="diameter", y="Abs_Magnitude", data=df, ax=ax) 3 plt.ylim(0,) ----> 4 c10.set_title('Abs_Magnitude Correlation with Diameter', fontdict={'fontsize':18}, pad=16) NameError: name 'c10' is not defined
# Scatter of albedo against diameter with a fitted regression line.
f, ax = plt.subplots(figsize=(12, 5))
c11 = sns.regplot(data=df, x="diameter", y="albedo", ax=ax)
ax.set_ylim(bottom=0)  # anchor the y-axis at zero, upper limit stays automatic
c11.set_title("albedo Correlation with Diameter", fontdict={"fontsize": 18}, pad=16)
Text(0.5, 1.0, 'albedo Correlation with Diameter')
# Scatter of moid against diameter with a fitted regression line.
f, ax = plt.subplots(figsize=(12, 5))
c12 = sns.regplot(data=df, x="diameter", y="moid", ax=ax)
ax.set_ylim(bottom=0)  # anchor the y-axis at zero, upper limit stays automatic
c12.set_title("Moid Correlation with Diameter", fontdict={"fontsize": 18}, pad=16)
Text(0.5, 1.0, 'Moid Correlation with Diameter')
# Scatter of Mean_motion against diameter with a fitted regression line.
f, ax = plt.subplots(figsize=(12, 5))
c13 = sns.regplot(data=df, x="diameter", y="Mean_motion", ax=ax)
ax.set_ylim(bottom=0)  # anchor the y-axis at zero, upper limit stays automatic
c13.set_title("Mean_motion Correlation with Diameter", fontdict={"fontsize": 18}, pad=16)
Text(0.5, 1.0, 'Mean_motion Correlation with Diameter')
# Scatter of Mean_anomaly against diameter with a fitted regression line.
f, ax = plt.subplots(figsize=(12, 5))
c14 = sns.regplot(data=df, x="diameter", y="Mean_anomaly", ax=ax)
ax.set_ylim(bottom=0)  # anchor the y-axis at zero, upper limit stays automatic
c14.set_title("Mean_Anomaly Correlation with Diameter", fontdict={"fontsize": 18}, pad=16)
Text(0.5, 1.0, 'Mean_Anomaly Correlation with Diameter')
# Scatter of orbital_Period_per against diameter with a fitted regression line.
f, ax = plt.subplots(figsize=(12, 5))
c15 = sns.regplot(x="diameter", y="orbital_Period_per", data=df, ax=ax)
plt.ylim(0,)  # anchor the y-axis at zero, upper limit stays automatic
# The title now names the plotted column; it previously read 'Eccentricity',
# copied over from the first plot.
c15.set_title('orbital_Period_per Correlation with Diameter', fontdict={'fontsize':18}, pad=16)
Text(0.5, 1.0, 'Eccentricity Correlation with Diameter')
# Annotated heatmap of the pairwise correlations between the numeric columns.
# Object columns are excluded before corr() because it cannot process them
# and fails on recent pandas versions.
plt.figure(figsize=(16, 6))
heatmap = sns.heatmap(df.select_dtypes(include="number").corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12)
#plt.savefig('heatmap.png', dpi=300, bbox_inches='tight')
Text(0.5, 1.0, 'Correlation Heatmap')
# Upper-triangular matrix of ones with the shape of the correlation matrix.
# Object columns are excluded before corr() because it cannot process them.
np.triu(np.ones_like(df.select_dtypes(include="number").corr()))
# Lower-triangle correlation heatmap: the upper triangle and the diagonal are
# masked out so each pair of columns appears only once.
plt.figure(figsize=(16, 6))
# Compute the matrix once, on numeric columns only (corr() cannot process the
# remaining object columns).
corr = df.select_dtypes(include="number").corr()
# Built-in bool replaces the np.bool alias, which newer NumPy no longer has.
mask = np.triu(np.ones_like(corr, dtype=bool))
heatmap = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Triangle Correlation Heatmap', fontdict={'fontsize':18}, pad=16);
# Single-column heatmap: correlation of each numeric feature with diameter,
# sorted from the strongest positive to the strongest negative, then saved to
# heatmap.png.
plt.figure(figsize=(4, 10))
# Object columns are excluded before corr() because it cannot process them
# and fails on recent pandas versions.
diameter_corr = df.select_dtypes(include="number").corr()[['diameter']]
heatmap = sns.heatmap(diameter_corr.sort_values(by='diameter', ascending=False), vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Features Correlating with Diameter', fontdict={'fontsize':18}, pad=16)
plt.savefig('heatmap.png', dpi=300, bbox_inches='tight')
Text(0.5, 1.0, 'Features Correlating with Diameter')